library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
## Attaching sp
library(patchwork)
library(Seurat)
library(devtools)
## Loading required package: usethis
The data were taken from PANGLAODB. It a sample of substanzia nigra form an adult donor.
note: sn stands for Substantia Nigra
load("/Users/carlomanenti/fasulla/SRA850958_SRS4386112.sparse.RData")
rownames(sm) <- sapply(strsplit(rownames(sm),"_"), `[`, 1)
sn.data <- sm
sn <- CreateSeuratObject(counts = sn.data, project = "snsc", min.cells = 3, min.features = 200)
## Warning: Non-unique features (rownames) present in the input matrix, making
## unique
grep("^MT-",rownames(sn),value = TRUE)
## [1] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" "MT-ND1"
## [8] "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5" "MT-ND6" "MT-RNR1"
## [15] "MT-RNR2" "MT-TC" "MT-TL1" "MT-TP"
sn[["percent.mt"]] <- PercentageFeatureSet(sn, pattern = "^MT-")
grep("^RP[LS]",rownames(sn),value = TRUE)
sn[["percent.rbp"]] <- PercentageFeatureSet(sn, pattern = "^RP[LS]")
VlnPlot(sn, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4)
VlnPlot(sn, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4, pt.size=0)
In this case the number of genes must be between 200 and 3000 and the mithocondrial DNA/RNA percentage must be lower than 5%
sn
## An object of class Seurat
## 27364 features across 7055 samples within 1 assay
## Active assay: RNA (27364 features, 0 variable features)
sn <- subset(sn, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 5)
sn
## An object of class Seurat
## 27364 features across 6434 samples within 1 assay
## Active assay: RNA (27364 features, 0 variable features)
sn <- NormalizeData(sn, normalization.method = "LogNormalize", scale.factor = 10000)
Seurat contains a pre-computed list of cell cycle specific genes that can be used to guess which CC phase the cell is in Since our sample are brain cells this probably won’t be too useful.
sn <- CellCycleScoring(sn, s.features = cc.genes.updated.2019$s.genes, g2m.features = cc.genes.updated.2019$g2m.genes, set.ident = TRUE)
## Warning: The following features are not present in the object: UNG, DTL, UHRF1,
## UBR7, EXO1, USP1, E2F8, not searching for symbol synonyms
## Warning: The following features are not present in the object: UBE2C, NDC80,
## MKI67, CKAP2L, AURKB, HJURP, CDC25C, DLGAP5, CENPA, not searching for symbol
## synonyms
sn[[]]
We keep a subset of genes with the greathest variability of expression across cells
sn <- FindVariableFeatures(sn, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(sn), 10)
plot1 <- VariableFeaturePlot(sn)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 18 rows containing missing values (geom_point).
We shift the expression of each gene so that the mean is 0 and the variance is 1 across cells
all.genes <- rownames(sn)
sn <- ScaleData(sn, features = all.genes)
## Centering and scaling data matrix
We perform PCA on the 2000 most variable genes
sn <- RunPCA(sn, features = VariableFeatures(object = sn))
## PC_ 1
## Positive: SPARCL1, ATP1A2, NDUFA4L2, RGS5, IFITM3, EPAS1, TSC22D1, GFAP, TIMP3, MT2A
## IFITM1, ABCG2, ITIH5, VIM, CLDN5, FLT1, IGFBP7, GNG11, IFI27, ITM2C
## HLA-E, HIGD1B, COBLL1, DCN, ATP1B2, ARHGAP29, CAVIN2, RBMS3, EGFL7, CA4
## Negative: PLP1, MBP, CNP, CRYAB, MAG, APLP1, ERMN, MOBP, CLDN11, TMEM144
## TF, SPP1, GPM6B, FEZ1, TUBA1A, SOX2-OT, CLDND1, PIP4K2A, SLC44A1, ENPP2
## SCD, QDPR, ATP1B1, ST18, RAPGEF5, KLK6, ANLN, S100B, STMN1, CNDP1
## PC_ 2
## Positive: SLC14A1, FGFR3, IQCA1, AQP4, AQP1, PTPRZ1, RYR3, SLC1A2, SLC6A11, GJA1
## SYNE1, NTRK2, RGMA, CAMK2G, MIAT, SDC4, TNC, BCAN, FOS, TRPM3
## APOE, ETNPPL, ID2, EFEMP1, AEBP1, AGT, GPM6A, PFKP, ID4, FAT3
## Negative: NDUFA4L2, RGS5, IFITM1, EPAS1, DCN, ITIH5, VIM, ABCG2, FLT1, COBLL1
## GNG11, MYL9, CAVIN2, CLDN5, ADGRF5, IGFBP7, CA4, ABCB1, HIGD1B, NOTCH3
## ARHGAP29, RERGL, SLC6A12, ESAM, EGFL7, SLC38A5, IFITM3, IFI27, B2M, HERC2P3
## PC_ 3
## Positive: MEG3, SYNPR, SYT1, HGNC:24955, SCG2, GAD2, SLC12A5, GRIK1, GAD1, CELF4
## ZNF385D, LRRC4C, GPM6A, KCNIP4-IT1, THY1, VSTM2L, ATP1A3, FGF14-IT1, KCNJ6, SNAP25
## SCN3B, SYN2, SLC8A1, PTPRN, SPHKAP, SLC4A10, GRIP2, SV2A, RAB3C, TSPAN7
## Negative: CD74, LPAR6, CSF3R, RNA5SP151, ADAM28, ITGAX, CSF1R, KCNMB1, C3, HLA-DRA
## A2M, MED12L, RNASET2, CX3CR1, GFAP, P2RY13, CLEC7A, LAPTM5, HLA-E, C1QB
## SLCO2B1, INPP5D, ADAP2, C1QC, LST1, PTPRC, SERPINB9, SLC14A1, SAMSN1, HLA-DRB1
## PC_ 4
## Positive: CD74, CSF3R, RNA5SP151, LPAR6, ADAM28, KCNMB1, CSF1R, ITGAX, HLA-DRA, MED12L
## CX3CR1, SAT1, ARHGAP24, C1QB, RNASET2, P2RY13, CLEC7A, LAPTM5, SRGAP2, C1QC
## MAF, SLCO2B1, C3, PTPRC, ADAP2, MEF2C, TMEM52B, LST1, TMC8, TNFRSF13C
## Negative: MT2A, SLC14A1, SYNE1, PTGDS, AQP4, GJA1, IQCA1, FGFR3, AQP1, ATP1A2
## MT1E, GFAP, MT1X, SDC4, AGT, RYR3, NTRK2, RGMA, ATP1B2, SLC6A11
## SORBS1, ETNPPL, MT2P1, TNC, MT1G, AEBP1, SLC1A2, CAMK2G, SPARCL1, EDNRB
## PC_ 5
## Positive: SYNPR, SYT1, GAD2, SLC12A5, VSTM2L, CELF4, GRIP2, SCN3B, CAMK2B, OTX2
## GABBR2, SYN2, PTPRN, GRIN1, RAB3B, PGM2L1, SNCB, TAC1, ZNF385D, KIT
## AC016717.2, SLC14A1, GUCY1A3, SYT7, IGSF3, NRGN, IQCA1, SNAP25, RAB3A, ATP2B2
## Negative: VCAN, B3GNT7, TNR, PDGFRA, PTPRZ1, COL9A1, CRISPLD2, LRRC4C, DSCAM, BCAN
## MMP16, MEG3, PLEKHH2, NEU4, PCDH15, PHLDA1, GPR17, ASCL1, FERMT1, CSPG5
## SLC35F1, SEMA5A, LUZP2, SOX6, SNTG1, CSPG4, OLIG2, SULF2, CD82, MARCKS
print(sn[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: SPARCL1, ATP1A2, NDUFA4L2, RGS5, IFITM3
## Negative: PLP1, MBP, CNP, CRYAB, MAG
## PC_ 2
## Positive: SLC14A1, FGFR3, IQCA1, AQP4, AQP1
## Negative: NDUFA4L2, RGS5, IFITM1, EPAS1, DCN
## PC_ 3
## Positive: MEG3, SYNPR, SYT1, HGNC:24955, SCG2
## Negative: CD74, LPAR6, CSF3R, RNA5SP151, ADAM28
## PC_ 4
## Positive: CD74, CSF3R, RNA5SP151, LPAR6, ADAM28
## Negative: MT2A, SLC14A1, SYNE1, PTGDS, AQP4
## PC_ 5
## Positive: SYNPR, SYT1, GAD2, SLC12A5, VSTM2L
## Negative: VCAN, B3GNT7, TNR, PDGFRA, PTPRZ1
VizDimLoadings(sn, dims = 1:2, reduction = "pca")
VizDimLoadings(sn, dims = 3:4, reduction = "pca")
VizDimLoadings(sn, dims = 4:5, reduction = "pca")
We project the cells in the first two principal components, the cells are coluored according to their cell cycle phase
DimPlot(sn, reduction = "pca")
The cell cycle seems to be ininfluent!
With ndims we can choose how many PC to plot
ElbowPlot(sn, ndims= 30)
Choosing how many dimensions to use can vary depending on the method we choose. In general it’s better to keep all PC until 70/75% of the variance is explained
pc.touse <- (sn$pca@stdev)^2
pc.touse <- pc.touse/sum(pc.touse)
pc.touse <- cumsum(pc.touse)[1:50]
pc.touse <- min(which(pc.touse>=0.85))
pc.touse
## [1] 27
In our case we opted for 11 PCs (standard deviation lower than 2) components and 20 PCs (At least 80% of the variance is explained).
sn.11.5 <- FindNeighbors(sn, dims = 1:11)
## Computing nearest neighbor graph
## Computing SNN
To cluster the cell we use the FindClusters function, which uses the Louvain algorithm to iteratively group cells together
sn.11.5 <- FindClusters(sn.11.5, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6434
## Number of edges: 227954
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9219
## Number of communities: 11
## Elapsed time: 0 seconds
sn.11.5 <- sn.11.5
We plot the clusters using TSNE
sn.11.5.tsne <- RunTSNE(sn.11.5, dims=1:11)
DimPlot(sn.11.5.tsne, reduction = "tsne")
## Uniform Manifold Approximation and Projection(UMAP), which is
generally preferred but requires the installation of a new package
sn.11.5.UMAP <- RunUMAP(sn.11.5, dims = 1:11)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 12:33:50 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:33:50 Read 6434 rows and found 11 numeric columns
## 12:33:50 Using Annoy for neighbor search, n_neighbors = 30
## 12:33:50 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:33:50 Writing NN index file to temp file /var/folders/nx/9kjdn6x158927qbzy599pcg00000gn/T//RtmpGUpfMS/file1c09126c25a8
## 12:33:50 Searching Annoy index using 1 thread, search_k = 3000
## 12:33:52 Annoy recall = 100%
## 12:33:52 Commencing smooth kNN distance calibration using 1 thread
## 12:33:52 Initializing from normalized Laplacian + noise
## 12:33:53 Commencing optimization for 500 epochs, with 263510 positive edges
## 12:34:05 Optimization finished
DimPlot(sn.11.5.UMAP, reduction = "umap")
VlnPlot(sn.11.5.UMAP,features="nCount_RNA")
VlnPlot(sn.11.5.UMAP,features="nFeature_RNA")
VlnPlot(sn.11.5.UMAP,features="percent.mt")
VlnPlot(sn.11.5.UMAP,features="percent.rbp")
library(ggplot2)
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
sn.11.5@meta.data %>%
group_by(seurat_clusters,Phase) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
geom_col() +
ggtitle("Percentage of cell cycle phases per cluster")
All in all there is quite some mt DNA in cluster 6 and 7 that may lead to probelms in indetifying marker genes. For all the other quality check the clustering seems reasonable.
Repeating the analysis for 27 PCs
sn.27.5 <- FindNeighbors(sn, dims = 1:27)
## Computing nearest neighbor graph
## Computing SNN
To cluster the cell we use the FindClusters function, which uses the Louvain algorithm to iteratively group cells together
sn.27.5 <- FindClusters(sn.27.5, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6434
## Number of edges: 299775
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8556
## Number of communities: 13
## Elapsed time: 0 seconds
sn.27.5 <- sn.27.5
We plot the clusters using TSNE
sn.27.5.tsne <- RunTSNE(sn.27.5, dims=1:27)
DimPlot(sn.27.5.tsne, reduction = "tsne")
## Uniform Manifold Approximation and Projection(UMAP), which is
generally preferred but requires the installation of a new package
sn.27.5.UMAP <- RunUMAP(sn.27.5, dims = 1:27)
## 12:34:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 12:34:26 Read 6434 rows and found 27 numeric columns
## 12:34:26 Using Annoy for neighbor search, n_neighbors = 30
## 12:34:26 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:34:27 Writing NN index file to temp file /var/folders/nx/9kjdn6x158927qbzy599pcg00000gn/T//RtmpGUpfMS/file1c093548d44c
## 12:34:27 Searching Annoy index using 1 thread, search_k = 3000
## 12:34:28 Annoy recall = 100%
## 12:34:28 Commencing smooth kNN distance calibration using 1 thread
## 12:34:29 Initializing from normalized Laplacian + noise
## 12:34:29 Commencing optimization for 500 epochs, with 287734 positive edges
## 12:34:42 Optimization finished
DimPlot(sn.27.5.UMAP, reduction = "umap")
VlnPlot(sn.27.5.UMAP,features="nCount_RNA")
VlnPlot(sn.27.5.UMAP,features="nFeature_RNA")
VlnPlot(sn.27.5.UMAP,features="percent.mt")
VlnPlot(sn.27.5.UMAP,features="percent.rbp")
## or the cell cycle
library(ggplot2)
library(dbplyr)
sn.27.5@meta.data %>%
group_by(seurat_clusters,Phase) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
geom_col() +
ggtitle("Percentage of cell cycle phases per cluster")
Having a higher resolution leads to a higher number of clusters. And in
this case the definition of clusters is impacted also by using many PCs
to explain the vast majority of variance. This way we probably ended up
defining the cluster also by noise and in a pure techinical way than a
meaningful way.
## **Overlapping**
library(ggplot2)
pbmc16_02 <- sn.11.5.UMAP
pbmc20_001 <- sn.27.5.UMAP
Clusters_1 <- 10
CLusters_2 <- 12
overlap <- data.frame(cluster_res1 = NA, cluster_res2 = NA, number_cells_common =NA)
# with PC=16 and resolution=0.2 I obtain clusters from 0 to 11, with PC=20 and resolution=0.01 I obtain clusters from 0 to 6
for (i in 0:Clusters_1) {
for (j in 0:CLusters_2) {
overlap <- rbind(overlap, c(i,j,length(intersect(rownames(as.data.frame(Idents(pbmc16_02)[Idents(pbmc16_02) == i])),rownames(as.data.frame(Idents(pbmc20_001)[Idents(pbmc20_001) == j]))))))
}
}
overlap <- overlap[c(-1),]
# dotplot
ggplot(overlap, aes(x= cluster_res1, y= cluster_res2, colour=number_cells_common, size=number_cells_common)) + geom_point() + xlab('Clustering 1 (11 PCs and 0.5 resolution)') + ylab('Clustering 2 (27 PCs and 1 resolution') + ggtitle('Overlapping cells') +
theme_classic() + scale_x_continuous(breaks = 0:12) + scale_y_continuous(breaks = 0:12)
The evalutation of overlapping cells in each cluster further asses the
problem of using to many PCs and a too high resolution, since many
clusters are probably split only due to noise.
Number of cell for each cluster
#
cat('11PCs and 0.5 resolution')
## 11PCs and 0.5 resolution
for (cluster_n in seq(0, Clusters_1 )){
n_cells<- length(Idents(sn.11.5.UMAP)[Idents(pbmc20_001) == cluster_n])
cat('Cluster', cluster_n, 'is composed of', n_cells, 'cells \n')
}
## Cluster 0 is composed of 932 cells
## Cluster 1 is composed of 784 cells
## Cluster 2 is composed of 747 cells
## Cluster 3 is composed of 708 cells
## Cluster 4 is composed of 649 cells
## Cluster 5 is composed of 644 cells
## Cluster 6 is composed of 622 cells
## Cluster 7 is composed of 485 cells
## Cluster 8 is composed of 291 cells
## Cluster 9 is composed of 208 cells
## Cluster 10 is composed of 200 cells
cat(' \n \n \n')
cat('27PCs and 1 resolution')
## 27PCs and 1 resolution
for (cluster_n in seq(0, CLusters_2 )){
n_cells<- length(Idents(sn.27.5.UMAP)[Idents(pbmc20_001) == cluster_n])
cat('Cluster', cluster_n, 'is composed of', n_cells, 'cells \n')
}
## Cluster 0 is composed of 932 cells
## Cluster 1 is composed of 784 cells
## Cluster 2 is composed of 747 cells
## Cluster 3 is composed of 708 cells
## Cluster 4 is composed of 649 cells
## Cluster 5 is composed of 644 cells
## Cluster 6 is composed of 622 cells
## Cluster 7 is composed of 485 cells
## Cluster 8 is composed of 291 cells
## Cluster 9 is composed of 208 cells
## Cluster 10 is composed of 200 cells
## Cluster 11 is composed of 124 cells
## Cluster 12 is composed of 40 cells
Also the number of cell for each cluster is much more variable using 27PCs and 1 as resolution. This may also contribute to meaningless clusters.
Seurat also includes a function that can be used to find genes over expressed between two clusters or overexpressed in one cluster with respect to all the others
The one vs all clusters analyses can be iterated automatically
sn.11.5 <- sn.11.5.UMAP
sn.markers <- FindAllMarkers(sn.11.5, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
sn.markers %>%
group_by(cluster) %>%
slice_max(n = 5, order_by = avg_log2FC)
## # A tibble: 55 × 7
## # Groups: cluster [11]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 2.76e-199 2.41 0.631 0.325 7.56e-195 0 STMN4
## 2 0 2.40 0.937 0.615 0 0 S100B
## 3 0 2.28 0.974 0.564 0 0 CRYAB
## 4 0 2.19 0.889 0.515 0 0 TUBA1A
## 5 6.77e-250 2.16 0.74 0.413 1.85e-245 0 STMN1
## 6 0 2.96 0.707 0.146 0 1 SYNE1
## 7 0 2.90 0.547 0.062 0 1 FGFR3
## 8 0 2.83 0.564 0.064 0 1 SLC14A1
## 9 0 2.77 0.488 0.053 0 1 IQCA1
## 10 0 2.74 0.445 0.036 0 1 SLC6A11
## # … with 45 more rows
sn.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(sn.11.5, features = top10$gene) + NoLegend()
Looking at the Heatmap we can see that clusters 2 and 3 show quite a lot
of similarities. Also clusters 1 and 8 are similar and clusters 5 and
10.
We want to asses the cell type shared by cluster 0, 2 and 3.
cluster0AND2AND3.markers <- FindMarkers(sn.11.5, ident.1 = c(0,2,3), min.pct = 0.25, test.use = "wilcox")
cluster0AND2AND3.markers <- cluster0AND2AND3.markers[order(-cluster0AND2AND3.markers$avg_log2FC), ]
head(cluster0AND2AND3.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## CRYAB 0 3.238763 0.985 0.397 0
## MAG 0 3.142866 0.847 0.152 0
## CNP 0 3.005078 0.912 0.275 0
## ERMN 0 2.960278 0.814 0.128 0
## PLP1 0 2.952570 0.996 0.594 0
## CLDN11 0 2.852663 0.674 0.134 0
## TUBA1A 0 2.708940 0.921 0.347 0
## FEZ1 0 2.674523 0.783 0.181 0
## STMN4 0 2.622196 0.725 0.140 0
## APLP1 0 2.597775 0.784 0.155 0
## S100B 0 2.575254 0.934 0.492 0
## QDPR 0 2.543784 0.711 0.187 0
## GJB1 0 2.451394 0.557 0.062 0
## TF 0 2.439317 0.879 0.316 0
## STMN1 0 2.411306 0.802 0.243 0
## RAPGEF5 0 2.406229 0.661 0.107 0
## PMP22 0 2.397809 0.638 0.118 0
## KLK6 0 2.286000 0.470 0.044 0
## CLDND1 0 2.261389 0.725 0.153 0
## SPP1 0 2.253051 0.913 0.344 0
MAG, CNP, ERMN -> Oligodendrocytes
It seems that they are all oligodendrocytes
Let’s see the genes making the difference between those two clusters: Genes overexpressed in cluster 0 vs cluster 2 and 3
cluster0vs23.markers <- FindMarkers(sn.11.5, ident.1 = 0, ident.2 = c(2, 3), min.pct = 0.25, test.use = "wilcox")
cluster0vs23.markers <- cluster0vs23.markers[order(-cluster0vs23.markers$avg_log2FC),]
head(cluster0vs23.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## TUBB2B 9.811298e-18 1.568625 0.421 0.389 2.684764e-13
## LGALS1 8.818528e-16 1.523813 0.444 0.451 2.413102e-11
## MYLK 2.714519e-04 1.414340 0.382 0.481 1.000000e+00
## S100B 1.435508e-166 1.354370 0.937 0.932 3.928124e-162
## STMN4 6.619515e-38 1.341009 0.631 0.803 1.811364e-33
## MGST3 8.322499e-01 1.203809 0.357 0.536 1.000000e+00
## SCRG1 1.411455e-01 1.186089 0.261 0.386 1.000000e+00
## SERPINI1 5.000873e-02 1.143643 0.294 0.463 1.000000e+00
## STMN1 1.390251e-66 1.126726 0.740 0.853 3.804284e-62
## GPD1 5.717462e-04 1.097975 0.217 0.353 1.000000e+00
The main DE genes are linked to the cytoskeleton of the cell and to Ca2+ mediated response. S100B -> Small subset of OPC modulation of OPC synapsis. May act as precursors of AST (astrocyets) ’’’ astrocytic marker S100β, pointing to a role of S100β in the modulation of OPCs synapses or identifying a subset of cells that may act as precursors for astrocytes ’’’ cit
And overexpressed in cluster 2 vs cluster 0 and 3
cluster2vs03.markers <- FindMarkers(sn.11.5, ident.1 = 2, ident.2 = c(0, 3), min.pct = 0.25, test.use = "wilcox")
cluster2vs03.markers <- cluster2vs03.markers[order(-cluster2vs03.markers$avg_log2FC),]
head(cluster2vs03.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## GAPDH 1.918641e-187 1.4246237 0.840 0.178 5.250169e-183
## PRDX1 8.965040e-176 1.1911357 0.842 0.179 2.453194e-171
## KLK6 9.883987e-158 1.1897185 0.927 0.285 2.704654e-153
## DNAH17-AS1 1.422112e-103 1.1518760 0.421 0.062 3.891467e-99
## FTLP3 4.365625e-147 1.1409274 0.718 0.149 1.194610e-142
## GAPDHP1 6.460215e-152 1.0644227 0.794 0.182 1.767773e-147
## C2orf82 1.232992e-113 1.0113819 0.334 0.022 3.373960e-109
## MARCKSL1 2.964506e-119 1.0058306 0.992 0.517 8.112075e-115
## PTGDS 1.831073e-117 0.9735822 0.997 0.606 5.010548e-113
## AC008417.1 6.097955e-99 0.9121903 0.448 0.074 1.668644e-94
GAPDH -> glycolysis PRDX1 -> protection against oxidative stress KLK6 -> Degrades alpha-synuclein and prevents its polymerization
Satellaite Oligodendrocytes
And overexpressed in cluster 3 vs cluster 0 and 2
cluster3vs02.markers <- FindMarkers(sn.11.5, ident.1 = 3, ident.2 = c(0, 2), min.pct = 0.25, test.use = "wilcox")
cluster3vs02.markers <- cluster3vs02.markers[order(-cluster3vs02.markers$avg_log2FC),]
head(cluster3vs02.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## OPALIN 1.171272e-291 2.829309 0.754 0.068 3.205069e-287
## ITM2A 1.048422e-119 1.855625 0.384 0.041 2.868902e-115
## SUN2 2.766859e-51 1.403976 0.337 0.113 7.571234e-47
## HHIP 1.906966e-84 1.360213 0.545 0.195 5.218221e-80
## CD82 4.269736e-65 1.324387 0.283 0.052 1.168371e-60
## QDPR 1.951641e-138 1.312583 0.969 0.621 5.340469e-134
## ANK3 2.268497e-87 1.281525 0.638 0.274 6.207516e-83
## LINC00844 5.641154e-69 1.275117 0.552 0.225 1.543645e-64
## MID1IP1 7.822136e-64 1.249336 0.578 0.279 2.140449e-59
## KIAA0930 4.752932e-65 1.226123 0.404 0.124 1.300592e-60
OPALIN -> Promotes oligodendrocyte terminal differentiation ITM2A -> Angiogenesis HHIP -> Myelin sheet organization ANK3 -> Myelin sheath organization MMID1IP1 -> lipid biosynthesis
Perivascular Oligodendrocytes
cluster0AND2AND3AND6.markers <- FindMarkers(sn.11.5, ident.1 = c(0,2,3,6), min.pct = 0.25, test.use = "wilcox")
cluster0AND2AND3AND6.markers <- cluster0AND2AND3AND6.markers[order(-cluster0AND2AND3AND6.markers$avg_log2FC), ]
head(cluster0AND2AND3AND6.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## MBP 0 3.990324 0.962 0.275 0
## PLP1 0 3.713211 0.981 0.529 0
## CLDN11 0 3.629129 0.628 0.074 0
## MOBP 0 3.487979 0.775 0.099 0
## CNP 0 3.403986 0.841 0.222 0
## MAG 0 3.344376 0.746 0.119 0
## CRYAB 0 3.321436 0.907 0.362 0
## ERMN 0 3.300869 0.723 0.086 0
## APLP1 0 3.260704 0.719 0.097 0
## ST18 0 3.020800 0.575 0.040 0
They are all olygodendrocytes, so we need to further asses the subtype of cluster 6.
cluster6vs023.markers <- FindMarkers(sn.11.5, ident.1 = 6, ident.2 = c(0, 2, 3), min.pct = 0.25, test.use = "wilcox")
cluster6vs023.markers <- cluster6vs023.markers[order(-cluster6vs023.markers$avg_log2FC),]
head(cluster6vs023.markers, n = 10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## NEAT1 2.573288e-108 1.512712 0.870 0.698 7.041545e-104
## MTATP6P1 1.614773e-19 1.481639 0.407 0.310 4.418664e-15
## MT-ND4L 1.548508e-13 1.466584 0.326 0.243 4.237336e-09
## MIR219A2 7.189556e-37 1.454766 0.580 0.458 1.967350e-32
## MT-RNR2 1.453683e-92 1.453312 0.852 0.711 3.977858e-88
## CERCAM 6.247712e-23 1.428932 0.453 0.341 1.709624e-18
## WSB1 1.721366e-42 1.346586 0.626 0.497 4.710345e-38
## MAP4K4 3.728394e-18 1.340851 0.388 0.285 1.020238e-13
## AATK 1.073924e-08 1.305483 0.272 0.207 2.938686e-04
## ERBIN 1.417623e-05 1.255003 0.264 0.227 3.879184e-01
cluster 6 vs ALL CERCAM: Myelin sheath organization TTLL7: Myelin sheath organization TMEM165: Calcium/proton transporter involved in calcium and in lysosomal pH homeostasis. ATP8A1: Myelin sheath organization
Intrafascicular Oligodendrocytes
repeating the analysis also for clusters 1 and 8
cluster1AND8.markers <- FindMarkers(sn.11.5, ident.1 = c(1, 8), min.pct = 0.25, test.use = "wilcox")
cluster1AND8.markers <- cluster1AND8.markers[order(-cluster1AND8.markers$avg_log2FC), ]
head(cluster1AND8.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## SLC14A1 0 3.585351 0.602 0.026 0
## AQP1 0 3.354460 0.512 0.030 0
## SYNE1 0 3.304413 0.710 0.113 0
## FGFR3 0 3.222259 0.547 0.034 0
## IQCA1 0 3.188155 0.506 0.023 0
## AQP4 0 3.155195 0.532 0.034 0
## MT2A 0 3.025081 0.845 0.304 0
## GJA1 0 2.975764 0.522 0.041 0
## ACOT11 0 2.910918 0.499 0.041 0
## GFAP 0 2.903876 0.759 0.188 0
## SDC4 0 2.880833 0.387 0.019 0
## RGMA 0 2.800576 0.406 0.022 0
## RYR3 0 2.790579 0.424 0.027 0
## APOE 0 2.717603 0.456 0.051 0
## SLC6A11 0 2.716400 0.402 0.023 0
## MT1X 0 2.693098 0.531 0.090 0
## DST 0 2.684294 0.911 0.393 0
## AHCYL1 0 2.684251 0.582 0.102 0
## AGT 0 2.670918 0.446 0.043 0
## MT1E 0 2.664751 0.552 0.097 0
SLC14A1, AQP1, SYNE1 -> Astrocytes
And overexpressed in cluster 1 vs cluster 8
cluster1vs8.markers <- FindMarkers(sn.11.5, ident.1 = 1, ident.2 = 8, min.pct = 0.25, test.use = "wilcox")
cluster1vs8.markers <- cluster1vs8.markers[order(-cluster1vs8.markers$avg_log2FC),]
head(cluster1vs8.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## SLC6A11 2.706412e-11 1.1372593 0.445 0.263 7.405826e-07
## SLC1A2 5.288953e-09 1.0810690 0.453 0.313 1.447269e-04
## GLUL 9.236793e-09 0.9860687 0.454 0.310 2.527556e-04
## APOC1 1.179102e-05 0.9298295 0.259 0.145 3.226496e-01
## NDRG2 2.894900e-12 0.8594817 0.702 0.616 7.921605e-08
## MIAT 3.441118e-03 0.7497609 0.320 0.259 1.000000e+00
## ACSS1 1.013476e-03 0.7398942 0.343 0.269 1.000000e+00
## SFXN5 2.990610e-02 0.6297845 0.337 0.306 1.000000e+00
## TTYH1 2.247265e-03 0.5741342 0.504 0.485 1.000000e+00
## CHD9 6.123444e-02 0.5734169 0.331 0.320 1.000000e+00
## LRP1B 3.017607e-02 0.5599995 0.263 0.215 1.000000e+00
## NTRK3 7.224036e-02 0.5357667 0.291 0.263 1.000000e+00
## KIAA0930 4.765375e-04 0.5270372 0.589 0.552 1.000000e+00
## MACF1 1.993998e-08 0.5215897 0.805 0.818 5.456377e-04
## MT-ND2 2.926346e-01 0.5186817 0.300 0.293 1.000000e+00
## BAZ2B 1.361101e-01 0.5130906 0.357 0.350 1.000000e+00
## NRXN1 1.035823e-01 0.4944489 0.261 0.232 1.000000e+00
## SLC1A3 7.389025e-03 0.4711984 0.481 0.461 1.000000e+00
## HIF3A 1.139567e-01 0.4705551 0.275 0.249 1.000000e+00
## FGFR3 1.630241e-02 0.4669772 0.547 0.545 1.000000e+00
GLUL (Glutamate-ammonia ligase): regulates the levels of toxic ammonia and converts neurotoxic glutamate to harmless glutamine SLC6A11: Terminates the action of GABA by its high affinity sodium-dependent reuptake into presynaptic terminals. Astrocytes type 2 (they promotes neuron survival and repair)
And overexpressed in cluster 8 vs cluster 1
cluster8vs1.markers <- FindMarkers(sn.11.5, ident.1 = 8, ident.2 = 1, min.pct = 0.25, test.use = "wilcox")
cluster8vs1.markers <- cluster8vs1.markers[order(-cluster8vs1.markers$avg_log2FC),]
head(cluster8vs1.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## ID3 3.291705e-40 1.9142349 0.374 0.066 9.007422e-36
## CPAMD8 2.275972e-44 1.9020350 0.290 0.025 6.227969e-40
## DPP10 4.358531e-52 1.7937343 0.394 0.049 1.192669e-47
## CD44 7.800680e-44 1.7148875 0.333 0.042 2.134578e-39
## MT1G 4.438674e-29 1.5739861 0.556 0.233 1.214599e-24
## COL21A1 5.325529e-34 1.5414487 0.306 0.051 1.457278e-29
## NUPR1 2.847955e-27 1.3163335 0.354 0.090 7.793144e-23
## AEBP1 8.383475e-33 1.2583071 0.609 0.233 2.294054e-28
## S100A1 2.205008e-26 1.1568562 0.549 0.216 6.033784e-22
## MT1F 1.361855e-24 1.1533683 0.434 0.144 3.726580e-20
## TJP2 7.984248e-22 1.1452740 0.279 0.069 2.184810e-17
## RPL11 1.518732e-26 1.0243270 0.502 0.172 4.155857e-22
## MT2P1 2.206938e-26 1.0079400 0.697 0.355 6.039066e-22
## NFASC 2.330822e-21 0.8956373 0.404 0.134 6.378060e-17
## TTN 9.071215e-24 0.8837213 0.667 0.309 2.482247e-19
## ITGB4 1.269255e-18 0.8779393 0.461 0.189 3.473189e-14
## CRYAB 4.352687e-16 0.8480805 0.667 0.385 1.191069e-11
## LINC01094 5.620849e-14 0.8424834 0.306 0.115 1.538089e-09
## CTSH 1.931727e-12 0.8331118 0.320 0.136 5.285977e-08
## C2orf82 5.096135e-16 0.8218179 0.273 0.084 1.394506e-11
DPP10 -> Modulates the activity and gating characteristics of the potassium channel KCND2 NDRG2 -> Contributes to the regulation of the Wnt signaling pathway. Down-regulates CTNNB1-mediated transcriptional activation of target genes, such as CCND1, and may thereby act as tumor suppressor.
'GFAP' %in% row.names(cluster8vs1.markers[cluster8vs1.markers$avg_log2FC > 0,])
## [1] TRUE
GFAP is a know marker for Astrocytes of type 1
Astrocytes type 1 (they promote death and toxiciy)
Finally repeating the analysis for clusters 5 and 10
cluster10AND5.markers <- FindMarkers(sn.11.5, ident.1 = c(10, 5), min.pct = 0.25, test.use = "wilcox")
cluster10AND5.markers <- cluster10AND5.markers[order(-cluster10AND5.markers$avg_log2FC), ]
head(cluster10AND5.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## RNA5SP151 0.000000e+00 4.124945 0.515 0.012 0.000000e+00
## LPAR6 0.000000e+00 4.104635 0.596 0.036 0.000000e+00
## CD74 0.000000e+00 4.095964 0.613 0.049 0.000000e+00
## CSF3R 0.000000e+00 4.069473 0.525 0.009 0.000000e+00
## KCNMB1 0.000000e+00 3.960295 0.472 0.013 0.000000e+00
## ITGAX 0.000000e+00 3.865904 0.463 0.007 0.000000e+00
## ADAM28 0.000000e+00 3.797975 0.469 0.009 0.000000e+00
## CSF1R 0.000000e+00 3.758224 0.433 0.007 0.000000e+00
## MED12L 0.000000e+00 3.522321 0.443 0.025 0.000000e+00
## SAT1 0.000000e+00 3.521831 0.786 0.198 0.000000e+00
## P2RY13 0.000000e+00 3.415369 0.342 0.010 0.000000e+00
## RNASET2 0.000000e+00 3.394445 0.405 0.022 0.000000e+00
## SFMBT2 2.220316e-206 3.368780 0.336 0.036 6.075672e-202
## HLA-DRA 8.698232e-297 3.314093 0.332 0.015 2.380184e-292
## MAF 1.981347e-277 3.272968 0.401 0.036 5.421759e-273
## CX3CR1 0.000000e+00 3.267177 0.322 0.005 0.000000e+00
## SRGAP2 1.142723e-236 3.192989 0.377 0.039 3.126948e-232
## CLEC7A 0.000000e+00 3.172228 0.307 0.007 0.000000e+00
## C3 3.053651e-295 3.171522 0.356 0.020 8.356011e-291
## C1QB 5.990104e-306 3.132509 0.315 0.011 1.639132e-301
LPAR6, ITGAX, ADAM28-> Microglia
Genes overexpressed in cluster 10 vs cluster 5
cluster10vs5.markers <- FindMarkers(sn.11.5, ident.1 = 10, ident.2 = 5, min.pct = 0.25, test.use = "wilcox")
cluster10vs5.markers <- cluster10vs5.markers[order(-cluster10vs5.markers$avg_log2FC),]
head(cluster10vs5.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## DST 1.115462e-56 3.189506 0.96 0.104 3.052350e-52
## FGFR3 1.670328e-86 3.081269 0.68 0.008 4.570687e-82
## SLC14A1 2.867634e-78 2.995867 0.64 0.009 7.846993e-74
## IQCA1 4.820062e-70 2.978803 0.60 0.011 1.318962e-65
## SYNE1 5.320494e-61 2.973614 0.66 0.026 1.455900e-56
## MT2A 2.121843e-45 2.905028 0.90 0.121 5.806210e-41
## MT1E 1.942050e-60 2.802417 0.60 0.019 5.314225e-56
## ACOT11 2.071802e-56 2.688174 0.56 0.017 5.669279e-52
## C1orf61 2.849189e-54 2.686989 0.84 0.068 7.796519e-50
## PLXNB1 1.852518e-49 2.609471 0.56 0.025 5.069230e-45
## RYR3 2.800696e-59 2.558244 0.54 0.012 7.663824e-55
## MACF1 7.227341e-37 2.486081 0.84 0.123 1.977690e-32
## AQP4 1.175331e-41 2.456832 0.52 0.028 3.216175e-37
## NDRG2 1.156062e-34 2.439602 0.78 0.106 3.163449e-30
## NTM 1.936312e-58 2.419858 0.52 0.011 5.298525e-54
## COL16A1 1.507053e-47 2.395279 0.48 0.016 4.123899e-43
## LGI4 5.841375e-44 2.392608 0.40 0.009 1.598434e-39
## SDC4 4.612178e-46 2.363473 0.40 0.008 1.262076e-41
## GJA1 1.346061e-50 2.342807 0.60 0.028 3.683362e-46
## NTRK2 3.297522e-57 2.340333 0.58 0.019 9.023339e-53
FGFR3 -> Apoptosis IQCA1 -> Motility
And overexpressed in cluster 5 vs cluster 10
cluster5vs10.markers <- FindMarkers(sn.11.5, ident.1 = 5, ident.2 = 10, min.pct = 0.25, test.use = "wilcox")
cluster5vs10.markers <- cluster5vs10.markers[order(-cluster5vs10.markers$avg_log2FC),]
head(cluster5vs10.markers, n = 20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## CD74 3.333088e-05 1.669590 0.616 0.58 9.120661e-01
## MEF2A 3.557113e-02 1.629902 0.314 0.24 1.000000e+00
## RPS18 8.398421e-03 1.600621 0.339 0.22 1.000000e+00
## SLCO2B1 1.060275e-01 1.492347 0.299 0.26 1.000000e+00
## RPS14 3.835965e-02 1.491329 0.286 0.20 1.000000e+00
## FMNL3 2.172339e-02 1.477008 0.342 0.26 1.000000e+00
## RPL10 2.573786e-02 1.428086 0.412 0.40 1.000000e+00
## C1QB 5.454442e-02 1.426445 0.319 0.26 1.000000e+00
## MAF 4.614209e-02 1.414126 0.401 0.40 1.000000e+00
## TMSB4X 1.280326e-11 1.379628 0.899 0.92 3.503484e-07
## B2M 1.786634e-03 1.377178 0.526 0.48 1.000000e+00
## RPLP2 1.573563e-01 1.351168 0.328 0.32 1.000000e+00
## RPL15 1.165688e-01 1.348350 0.283 0.24 1.000000e+00
## RPL37A 1.943178e-01 1.346073 0.285 0.26 1.000000e+00
## RPL3 9.163406e-02 1.312962 0.386 0.40 1.000000e+00
## MEF2C 1.039114e-01 1.303985 0.386 0.42 1.000000e+00
## RPLP1 1.384900e-01 1.298989 0.365 0.38 1.000000e+00
## ADAP2 9.214586e-02 1.273756 0.266 0.20 1.000000e+00
## SFMBT2 4.265497e-02 1.265804 0.342 0.26 1.000000e+00
## SERPINB9 5.897223e-02 1.259930 0.306 0.22 1.000000e+00
CD74 -> Immune response (MHC complex) MEF2A -> phosphorylated and sumoylated MEF2A represses transcription of NUR77 promoting synaptic differentiation
Microglia supporting neurogenesis
Interestingly MEF2A and CD74 (and many more) are highly correlated with cluster 5! So it may be possibile that cluster 10 is a ‘bad’ cluster. (not meaningful)
for (feature in c("S100B", "SLC6A11", "GAPDH", "OPALIN", "PTPRZ1", "MEF2A", "CERCAM", "RGS5", 'GFAP', 'SYT1', 'CD74')){
p <- FeaturePlot(sn.11.5, features = feature, repel = T)
plot(p)
}
Or with a violin plot
VlnPlot(sn.11.5, features = "S100B")
VlnPlot(sn.11.5, features = "SLC6A11")
VlnPlot(sn.11.5, features = "GAPDH")
VlnPlot(sn.11.5, features = "OPALIN")
VlnPlot(sn.11.5, features = "PTPRZ1")
VlnPlot(sn.11.5, features = "MEF2A")
VlnPlot(sn.11.5, features = "CERCAM")
VlnPlot(sn.11.5, features = "RGS5")
VlnPlot(sn.11.5, features = "GFAP")
VlnPlot(sn.11.5, features = "SYT1")
VlnPlot(sn.11.5, features = "CD74")
We can visualize all the marker genes and their expression (CPM) in each cluster using a dot plot.
library(ggrepel)
DotPlot(sn.11.5, features = c("S100B", "SLC6A11", "GAPDH", "OPALIN", "PTPRZ1", "MEF2A", "CERCAM", "RGS5", 'GFAP', 'SYT1', 'CD74')) + theme(axis.text.x = element_text(angle = 90))
Using marker genes to infere the subtypes of each cluster was quite difficult and needed a deep knowlegde of the fild (which in our case is limited). Eventually we were able to identify 10 different cells types of which 4 subtypes of ODC, 2 subtypes of AST and Microglia, Neuros, OPC ad Endothelial cells.
Comparing the results with the automated pipeline employed by Panglao, we were able to label and cluster the cells with unknown cell type. Also we did not found Pancreatich stellate cells and Pericytes, may be due to different clustering parameters.
new.cluster.ids <- c("OPC AST-precursors", "AST-2", "Satellite ODC", "Perivascular ODC", "OPC", "Microglia", "Intrafascicular ODC", "Endothelial", 'AST-1', 'Neurons', 'Microglia')
names(new.cluster.ids) <- levels(sn.11.5)
sn.11.5 <- RenameIdents(sn.11.5, new.cluster.ids)
DimPlot(sn.11.5, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()